###################################################################################################
# Reducing Model Misspecification and Bias in the Estimation of Interactions ######################
# Matthew Blackwell & Michael Olson ###############################################################
# Replication of Escriba-Folch, Meseguer, Wright (2018) ###########################################
###################################################################################################

###################################################################################################
# load required packages, and set seed for replication #####################
###################################################################################################

# packages

  require("ggplot2")
  require("lfe")
  require("inters")
  require("RColorBrewer")
  require("KRLS")
  require("lmtest")
  require("haven")
  require("BART")

# seed and number of bootstrap iterations

  set.seed(63116)
  n_boots <- 5000

# define a color
  
  skyblue <- rgb(236, 240, 241, max = 255)
  
###################################################################################################
# load and prepare replication data ###############################################################
###################################################################################################

# load replication data  
    
  emw <- read_dta("data/efmw_replication.dta")
  emw <- as.data.frame(emw)
  
# create two variables, following the authors' replication code

  emw$dist <- log(1 + (1 / (emw$dist_coast)))
  emw$distwremit <- log(1 + ( (emw$richremit / 1000000) * (emw$dist)))
  
# limit the data to relevant columns and complete cases of those variables
  
  emw <- emw[, c("Protest", "remit", "dict", "l1gdp", "l1pop", "l1nbr5", "l12gr",
                 "l1migr", "elec3", "cowcode", "period", "distwremit",
                 "caseid", "year")]
  emw <- na.omit(emw)
  
# now we create some matrices, including model matrices for the fixed effects,
# for the methods that don't just take a typical data frame/formula
  
# control variables
  
  controls <- c("l1gdp", "l1pop", "l1nbr5", "l12gr", "l1migr", "elec3")

# model matrix for the fixed effects. as is described in the paper, we use
# a sum coding for the fixed effects so there's no single omitted unit/time period
 
  contr.list <- list(contr.sum, contr.sum)
  names(contr.list) <- c("factor(period)","factor(cowcode)")
  mod_mat <- model.matrix(~factor(period)+factor(cowcode),data=emw,contrasts.arg=contr.list)[,-1]
  
# now make a matrix of all the controls (with fixed effects)

  X <- as.matrix(cbind(emw[,controls],mod_mat))

# and a matrix of those interacted with the moderator

  XV <- as.matrix(X*emw$dict)
  
# and then bind all of the right-hand side variables together

  big_x <- as.matrix(cbind(d = emw$remit, d_V = emw$remit*emw$dict, V = emw$dict, X = X, XV = XV))
  small_x <- cbind(d = emw$remit, V = emw$dict, X = X)

###################################################################################################
# replicate models from paper #####################################################################
###################################################################################################  
  
# this is just to demonstrate that we've accurately reflected the data coding process insofar 
# as we can exactly reproduce the findings from the original paper
  
  tab1_col1 <- felm(Protest~remit+dict+
                      l1gdp+l1pop+l1nbr5+l12gr+l1migr+elec3
                    |period+cowcode|0|caseid,
                    data=emw)
      
  tab1_col2 <- felm(Protest~remit*dict+
                      l1gdp+l1pop+l1nbr5+l12gr+l1migr+elec3
                    |period+cowcode|0|caseid,
                    data=emw)

###################################################################################################
# estimate our various models #####################################################################
###################################################################################################  
  
# single interaction model ########################################################################
# this is actually just the second model above, from the original paper
  
  int <- tab1_col2

# fully moderated model ###########################################################################
  
  allint <- felm(Protest~dict*(remit+l1gdp+l1pop+l1nbr5+l12gr+l1migr+
                                 elec3+factor(period)+factor(cowcode))|0|0|caseid,
                                             data=emw)
  
# post double-selection ###########################################################################
  
  post_lasso <- post_ds_interaction(data = emw, treat = "remit", moderator = "dict",
                                  outcome = "Protest", control_vars = controls,
                                  panel_vars = c("cowcode", "period"),
                                  cluster = "caseid")
  post_lasso_vcov <- post_lasso$vcv
  
# post single-selection ###########################################################################
  
  post_single_lasso <- post_ds_interaction(data = emw, treat = "remit", moderator = "dict",
                                    outcome = "Protest", control_vars = controls,
                                    panel_vars = c("cowcode", "period"),
                                    cluster = "caseid",
                                    method="single selection")
  post_single_lasso_vcov <- post_single_lasso$vcv
  
# adaptive lasso ##################################################################################
  
  # first we need to get the weights for the lasso from a ridge regression
  
    ridge_out <- glmnet::cv.glmnet(x = big_x, y = c(emw$Protest), alpha = 0)
    ridge_coefs <- as.numeric(coef(ridge_out, s = "lambda.1se"))[-1]
    alasso_pens <- 1 / abs(ridge_coefs)
    alasso_pens[1:(ncol(X) + 3)] <- 0 # we force all the base terms, treatment, moderator, 
                                      # and the interaction of interest to stay in
    
  # now estimate the lasso with these weights
    
    alasso_out <- glmnet::cv.glmnet(x = big_x, y = c(emw$Protest), penalty.factor = alasso_pens)
    alasso_coefs <- as.numeric(coef(alasso_out, s = "lambda.1se"))[-1]
    names(alasso_coefs) <- colnames(big_x)
    
  # now bootstrap confidence intervals

    # storage 
    
      alasso_boots <- matrix(NA, nrow = n_boots, ncol = 3)
      
    # now bootstrap
  
      for (b in 1:n_boots) {
        if (b %% 100 == 0) cat(b, "...\n")
        # we block on caseid, which is the clustering variable in teh original analysis
        
        caseids <- sample(unique(emw$caseid),length(unique(emw$caseid)),replace=T)
        
        bit <- list()
        
        for(j in 1:length(unique(emw$caseid))){
          
          bit[[j]] <- emw[emw$caseid==caseids[j],]
          
        }
        
        # assemble the bootstrap data
        
        bootdat <- do.call(rbind,bit)
        
        mod_mat_boot <- model.matrix(~factor(period)+factor(cowcode),data=bootdat,contrasts.arg=contr.list)[,-1]
        
        X_boot <- as.matrix(cbind(bootdat[,controls],mod_mat_boot))
        XV_boot <- as.matrix(X_boot*bootdat$dict)
        
        big_x_boot <- as.matrix(cbind(d = bootdat$remit, d_V = bootdat$remit*bootdat$dict, V = bootdat$dict, X = X_boot, XV = XV_boot))
        
        # re-estimate the adaptive lasso on the bootstrapped data
        
        ridge_boot <- glmnet::glmnet(x = big_x_boot, y = c(bootdat$Protest), alpha = 0, lambda = ridge_out$lambda.1se)
        ridge_coefs_boot <- as.numeric(coef(ridge_boot))[-1]
        alasso_pens <- 1 / abs(ridge_coefs_boot)
        alasso_pens[1:(ncol(X_boot) + 3)] <- 0
        alasso_boot <- glmnet::glmnet(x = big_x_boot, y = c(bootdat$Protest), penalty.factor = alasso_pens,
                                      lambda = alasso_out$lambda.1se)
        alasso_coefs_boot <- as.numeric(coef(alasso_boot))[-1]
        names(alasso_coefs_boot) <- colnames(big_x_boot)
        
        # store the results--we want both the treatment, the interaction, and the marginal effect for when moderator=1
        
        alasso_boots[b, ] <- c(alasso_coefs_boot[c("d", "d_V")],sum(alasso_coefs_boot[c("d", "d_V")]))

      }
  
# kernel regularized least squares ################################################################
      
  krls_out <- KRLS::krls(y = c(emw$Protest), X = small_x,derivative = F)
  small_x_pred <- rbind(cbind(d = mean(small_x[,1])+0.5, V = 1, small_x[, -c(1,2)]),
                        cbind(d = mean(small_x[,1])-0.5, V = 1, small_x[, -c(1,2)]),
                        cbind(d = mean(small_x[,1])+0.5, V = 0, small_x[, -c(1,2)]),
                        cbind(d = mean(small_x[,1])-0.5, V = 0, small_x[, -c(1,2)]))
  preds <- predict(krls_out, newdata = small_x_pred, se = TRUE)
  n_units <- nrow(small_x)
  ws <- c(1 / n_units, -1 / n_units, -1 / n_units, 1 / n_units)
  h <- matrix(rep(ws, each = n_units), ncol = 1)
  krls_int <- as.vector(t(h) %*% preds$fit)
  krls_intse <- as.vector(sqrt(t(h)%*%preds$vcov.fit%*%h))*sqrt(4)
  
  small_x_pred2 <- rbind(cbind(mean(small_x[,1])+0.5,V=0,small_x[, -c(1,2)]),
                         cbind(mean(small_x[,1])-0.5,V=0,small_x[, -c(1,2)]))
  preds <- predict(krls_out, newdata = small_x_pred2, se = TRUE)
  n_units <- nrow(small_x)
  ws <- c(1 / n_units, -1 / n_units)
  h <- matrix(rep(ws, each = n_units), ncol = 1)
  krls_main <- as.vector(t(h) %*% preds$fit)
  krls_mainse <- as.vector(sqrt(t(h)%*%preds$vcov.fit%*%h))*sqrt(2)
  
# bayesian additive regression trees #############################################################
  
  small_x_bart <- rbind(
    c(d = mean(small_x[,1],na.rm=T)+0.5, V = 1, colMeans(X)),
    c(d = mean(small_x[,1],na.rm=T)-0.5, V = 1, colMeans(X)),
    c(d = mean(small_x[,1],na.rm=T)+0.5, V = 0, colMeans(X)),
    c(d = mean(small_x[,1],na.rm=T)-0.5, V = 0, colMeans(X))
  )

  bart_out <- BART::wbart(small_x, emw$Protest, ndpost = 80000, nskip=20000,
                            x.test = small_x_bart)

  bart_main <- (bart_out$yhat.test[, 3] - bart_out$yhat.test[, 4])
  bart_main_ci <- quantile(bart_main, probs = c(0.025, 0.975))
  bart_main <- mean(bart_main)
  
  bart_int <- (bart_out$yhat.test[, 1] - bart_out$yhat.test[, 2] -
    bart_out$yhat.test[, 3] + bart_out$yhat.test[, 4] )
  bart_int_ci <- quantile(bart_int, probs = c(0.025, 0.975))
  bart_int <- mean(bart_int)
  bart_cov <- as.numeric(bart_int_ci[1] < 1 &
                               bart_int_ci[2] > 1)
      
      
###################################################################################################
# make figures ####################################################################################
###################################################################################################  

# create a dataframe with relevant coefficients

# start with the estimators that produce standard errors
  
  coefficients <- as.data.frame(rbind(summary(int)$coefficients[c("remit","remit:dict"),1:2],
                                      summary(allint)$coefficients[c("remit","dict:remit"),1:2],
                                      coeftest(post_lasso,post_lasso_vcov)[c("remit","remit_dict"),1:2],
                                      coeftest(post_single_lasso,post_single_lasso_vcov)[c("remit","remit_dict"),1:2],
                                      c(krls_main,krls_mainse),
                                      c(krls_int,krls_intse)
  ))
  
  # make 95% confidence intervals
  
  coefficients$lower_ci <- coefficients[,1]-1.96*coefficients[,2]
  coefficients$upper_ci <- coefficients[,1]+1.96*coefficients[,2]
  
  coefficients$`Cluster s.e.` <- NULL
  
  # now add BART and adaptive lasso, which just had CI to begin with
  
  alasso_chunk <- as.data.frame(cbind(c(alasso_coefs[1:2],sum(alasso_coefs[1:2])),t(apply(alasso_boots,2,function(x) quantile(x, probs = c(0.025, 0.975))))))
  colnames(alasso_chunk) <- c("Estimate","lower_ci","upper_ci"); rownames(alasso_chunk) <- NULL
  
  coefficients <- rbind(coefficients,
                        c(bart_main,bart_main_ci),
                        c(bart_int,bart_int_ci),
                        alasso_chunk
  )  
      
  # drop rownames, add information on estimators and QOI
  
  rownames(coefficients) <- NULL
  
  coefficients$Method <- c("Single Interaction","Single Interaction",
                           "Fully Moderated","Fully Moderated",
                           "Post-Double Selection","Post-Double Selection",
                           "Post-Lasso","Post-Lasso",
                           "KRLS","KRLS","BART","BART","ALasso","ALasso","ALasso")
  
  coefficients$QOI       <- c(rep(c("Marginal Effect, Democracy","Interaction"),7),"Marginal Effect, Autocracy")
      
# now we make and add the marginal effect in the South
  
  margeff_dict_simple <- sum(coefficients$Estimate[coefficients$Method=="Single Interaction"])
  margeff_dict_full   <- sum(coefficients$Estimate[coefficients$Method=="Fully Moderated"])
  margeff_dict_pl   <- sum(coefficients$Estimate[coefficients$Method=="Post-Double Selection"])
  
  margeff_dict_simple_se <- sqrt(int$clustervcv["remit","remit"]+
                                    int$clustervcv["remit:dict","remit:dict"]+
                                    2*int$clustervcv["remit:dict","remit"])
  
  margeff_dict_full_se <- sqrt(allint$clustervcv["remit","remit"]+
                                  allint$clustervcv["dict:remit","dict:remit"]+
                                  2*allint$clustervcv["dict:remit","remit"])
  
  margeff_dict_pl_se <- sqrt(post_lasso_vcov["remit","remit"]+
                                post_lasso_vcov["remit_dict","remit_dict"]+
                                2*post_lasso_vcov["remit_dict","remit"])
  
  margeff_simple_row <- c(margeff_dict_simple,margeff_dict_simple-1.96*margeff_dict_simple_se,margeff_dict_simple+1.96*margeff_dict_simple_se)
  margeff_full_row <- c(margeff_dict_full,margeff_dict_full-1.96*margeff_dict_full_se,margeff_dict_full+1.96*margeff_dict_full_se)
  margeff_pl_row <- c(margeff_dict_pl,margeff_dict_pl-1.96*margeff_dict_pl_se,margeff_dict_pl+1.96*margeff_dict_pl_se)
  
  margeff_dict_coefs <- as.data.frame(rbind(margeff_simple_row,margeff_full_row,margeff_pl_row))
  colnames(margeff_dict_coefs) <- c("Estimate","lower_ci","upper_ci"); rownames(margeff_dict_coefs) <- NULL
  margeff_dict_coefs$QOI <- "Marginal Effect, Autocracy"; margeff_dict_coefs$Method <- c("Single Interaction","Fully Moderated","Post-Double Selection")
  
  coefficients <- rbind(coefficients,margeff_dict_coefs)
  
  coefficients$Method <- factor(coefficients$Method,levels=c("Single Interaction","Fully Moderated","Post-Lasso","Post-Double Selection","ALasso","BART","KRLS"))
  coefficients$QOI <- factor(coefficients$QOI,levels=c("Marginal Effect, Democracy","Marginal Effect, Autocracy","Interaction"))
  
  
# set the theme of the figure
  
  theme_set(theme_minimal() +
              theme(panel.border = element_rect(fill = NA)))
  theme_cousteau <- theme_light() +
    theme(plot.background = element_rect(fill = skyblue, color = skyblue),
          legend.background = element_rect(fill = skyblue))
  
  hues <- seq(15, 375, length = 7 + 1)
  cols <-  hcl(h = hues, l = 65, c = 100)[c(1, 2, 2, 3, 4, 5, 6, 7)]
  names(cols) <- c("Post-Double Selection", "Post-Lasso", "ALasso", "Oracle", "Fully Moderated",
                   "KRLS", "BART", "Single Interaction")
  shapes <- c(15, 18, 18, 3, 17, 7, 19, 9)
  names(shapes) <- c("Post-Double Selection", "Post-Lasso", "ALasso", "Oracle", "Fully Moderated",
                     "KRLS", "BART", "Single Interaction")
  
# make figures
  
  cairo_pdf("figures/remittances_coefplot.pdf", width = 8, height = 4, family = "Verdana")
  print(
    ggplot(coefficients[is.element(coefficients$Method,c("Single Interaction",
                                                         "Fully Moderated",
                                                         "Post-Double Selection",
                                                         "ALasso")),], aes(x = QOI, y = Estimate, group = Method,
                                                                           colour = Method, pch = Method)) +
      geom_hline(yintercept = 0, linetype = 2, size = 1, colour = "black") +
      geom_point(size = 3, position = position_dodge(width = 0.5)) +
      geom_errorbar(size = 0.5, aes(ymin = lower_ci,
                                    ymax = upper_ci),
                    position = position_dodge(width = 0.5), width = 0) +
      xlab("Quantity of Interest") +
      labs(color = "Method", shape = "Method") +
      scale_color_manual(values = cols[c("Single Interaction","Fully Moderated","Post-Double Selection","ALasso")]) +
      scale_shape_manual(values = shapes[c("Single Interaction","Fully Moderated","Post-Double Selection","ALasso")]) +
      theme(axis.title.y = element_text(angle = 0, vjust = 0.5, hjust = 0.5)) 
  )
  dev.off()
  
  
  
  cairo_pdf("figures/remittances_coefplot_additional.pdf", width = 8, height = 4, family = "Verdana")
  print(
    ggplot(coefficients[is.element(coefficients$Method,c("Post-Lasso",
                                                         "BART",
                                                         "KRLS")) & coefficients$QOI!="Marginal Effect, Autocracy",], 
           aes(x = QOI, y = Estimate, group = Method,
               colour = Method, pch = Method)) +
      geom_hline(yintercept = 0, linetype = 2, size = 1, colour = "black") +
      geom_point(size = 3, position = position_dodge(width = 0.5)) +
      geom_errorbar(size = 0.5, aes(ymin = lower_ci,
                                    ymax = upper_ci),
                    position = position_dodge(width = 0.5), width = 0) +
      xlab("Quantity of Interest") +
      labs(color = "Method", shape = "Method") +
      scale_color_manual(values = cols[c("Post-Lasso","BART","KRLS")]) +
      scale_shape_manual(values = shapes[c("Post-Lasso","BART","KRLS")]) +
      theme(axis.title.y = element_text(angle = 0, vjust = 0.5, hjust = 0.5)) 
  )
  dev.off()
  
  
  
